Importing Libraries¶
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#pyclustering
from pyclustering.cluster.kmedians import kmedians
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster.center_initializer import random_center_initializer
#scipy
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import ward, fcluster
#sklearn
import sklearn
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm, tree, neighbors
from sklearn import naive_bayes, ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
#warnings
import warnings
warnings.filterwarnings('ignore')
Data¶
In [2]:
#import data
airline = pd.read_csv('/Users/meghananekkanti/Desktop/Spring /DNSC_6315_ML2/Project/Airline_Passenger.csv')
airline.head()
Out[2]:
| Unnamed: 0 | id | Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 19556 | Female | Loyal Customer | 52 | Business travel | Eco | 160 | 5 | 4 | ... | 5 | 5 | 5 | 5 | 2 | 5 | 5 | 50 | 44.0 | satisfied |
| 1 | 1 | 90035 | Female | Loyal Customer | 36 | Business travel | Business | 2863 | 1 | 1 | ... | 4 | 4 | 4 | 4 | 3 | 4 | 5 | 0 | 0.0 | satisfied |
| 2 | 2 | 12360 | Male | disloyal Customer | 20 | Business travel | Eco | 192 | 2 | 0 | ... | 2 | 4 | 1 | 3 | 2 | 2 | 2 | 0 | 0.0 | neutral or dissatisfied |
| 3 | 3 | 77959 | Male | Loyal Customer | 44 | Business travel | Business | 3377 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 3 | 1 | 4 | 0 | 6.0 | satisfied |
| 4 | 4 | 36875 | Female | Loyal Customer | 49 | Business travel | Eco | 1182 | 2 | 3 | ... | 2 | 2 | 2 | 2 | 4 | 2 | 4 | 0 | 20.0 | satisfied |
5 rows × 25 columns
In [3]:
#check columns
airline.columns
Out[3]:
Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
'Class', 'Flight Distance', 'Inflight wifi service',
'Departure/Arrival time convenient', 'Ease of Online booking',
'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
'Inflight entertainment', 'On-board service', 'Leg room service',
'Baggage handling', 'Checkin service', 'Inflight service',
'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
'satisfaction'],
dtype='object')
In [4]:
airline['satisfaction'].value_counts()
Out[4]:
satisfaction neutral or dissatisfied 73452 satisfied 56428 Name: count, dtype: int64
In [5]:
#shape
airline.shape
Out[5]:
(129880, 25)
Cleaning data¶
In [6]:
#removing unnamed column
airline = airline.drop(columns = ['Unnamed: 0', 'id']) #since not used
- data is very clean
- not checking for null values , because Arrival Delay has 0 values = no delay
Dealing with missing values¶
In [7]:
#checking for missing values
airline.isna().sum()
Out[7]:
Gender 0 Customer Type 0 Age 0 Type of Travel 0 Class 0 Flight Distance 0 Inflight wifi service 0 Departure/Arrival time convenient 0 Ease of Online booking 0 Gate location 0 Food and drink 0 Online boarding 0 Seat comfort 0 Inflight entertainment 0 On-board service 0 Leg room service 0 Baggage handling 0 Checkin service 0 Inflight service 0 Cleanliness 0 Departure Delay in Minutes 0 Arrival Delay in Minutes 393 satisfaction 0 dtype: int64
In [8]:
#in arrival Delay in Minutes na values = 0, meaning no delay
#fill with 0
airline['Arrival Delay in Minutes'].fillna(0, inplace = True)
#check again
airline.isna().sum()
Out[8]:
Gender 0 Customer Type 0 Age 0 Type of Travel 0 Class 0 Flight Distance 0 Inflight wifi service 0 Departure/Arrival time convenient 0 Ease of Online booking 0 Gate location 0 Food and drink 0 Online boarding 0 Seat comfort 0 Inflight entertainment 0 On-board service 0 Leg room service 0 Baggage handling 0 Checkin service 0 Inflight service 0 Cleanliness 0 Departure Delay in Minutes 0 Arrival Delay in Minutes 0 satisfaction 0 dtype: int64
Converting categorical data¶
In [9]:
#convert Gender, Customer Type, Type of Travel, Class to categorical
airline['Gender'] = airline['Gender'].astype('category')
airline['Customer Type'] = airline['Customer Type'].astype('category')
airline['Type of Travel'] = airline['Type of Travel'].astype('category')
airline['satisfaction'] = airline['satisfaction'].astype('category')
airline.dtypes
Out[9]:
Gender category Customer Type category Age int64 Type of Travel category Class object Flight Distance int64 Inflight wifi service int64 Departure/Arrival time convenient int64 Ease of Online booking int64 Gate location int64 Food and drink int64 Online boarding int64 Seat comfort int64 Inflight entertainment int64 On-board service int64 Leg room service int64 Baggage handling int64 Checkin service int64 Inflight service int64 Cleanliness int64 Departure Delay in Minutes int64 Arrival Delay in Minutes float64 satisfaction category dtype: object
In [10]:
#check for unique
print(airline['Gender'].unique())
print(airline['Customer Type'].unique())
print(airline['Type of Travel'].unique())
print(airline['Class'].unique())
print(airline['satisfaction'].unique())
['Female', 'Male'] Categories (2, object): ['Female', 'Male'] ['Loyal Customer', 'disloyal Customer'] Categories (2, object): ['Loyal Customer', 'disloyal Customer'] ['Business travel', 'Personal Travel'] Categories (2, object): ['Business travel', 'Personal Travel'] ['Eco' 'Business' 'Eco Plus'] ['satisfied', 'neutral or dissatisfied'] Categories (2, object): ['neutral or dissatisfied', 'satisfied']
In [11]:
#formatting them
categorical_columns = airline.select_dtypes(['category']).columns
categorical_columns
airline[categorical_columns] = airline[categorical_columns].apply(lambda x: x.cat.codes)
In [12]:
#formatting class 0-2
category_mapping = {'Eco': 0, 'Eco Plus': 1, 'Business': 2}
airline['Class'] = airline['Class'].map(category_mapping)
In [13]:
#checking
print(airline['Gender'].unique())
print(airline['Customer Type'].unique())
print(airline['Type of Travel'].unique())
print(airline['Class'].unique())
print(airline['satisfaction'].unique())
[0 1] [0 1] [0 1] [0 2 1] [1 0]
In [14]:
airline.head()
Out[14]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 52 | 0 | 0 | 160 | 5 | 4 | 3 | 4 | ... | 5 | 5 | 5 | 5 | 2 | 5 | 5 | 50 | 44.0 | 1 |
| 1 | 0 | 0 | 36 | 0 | 2 | 2863 | 1 | 1 | 3 | 1 | ... | 4 | 4 | 4 | 4 | 3 | 4 | 5 | 0 | 0.0 | 1 |
| 2 | 1 | 1 | 20 | 0 | 0 | 192 | 2 | 0 | 2 | 4 | ... | 2 | 4 | 1 | 3 | 2 | 2 | 2 | 0 | 0.0 | 0 |
| 3 | 1 | 0 | 44 | 0 | 2 | 3377 | 0 | 0 | 0 | 2 | ... | 1 | 1 | 1 | 1 | 3 | 1 | 4 | 0 | 6.0 | 1 |
| 4 | 0 | 0 | 49 | 0 | 0 | 1182 | 2 | 3 | 4 | 3 | ... | 2 | 2 | 2 | 2 | 4 | 2 | 4 | 0 | 20.0 | 1 |
5 rows × 23 columns
EDA¶
Distribution of age¶
In [15]:
plt.figure(figsize = (8, 6))
sns.histplot(airline['Age'], bins = 20, kde = True, color = 'purple')
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
Count of Gender¶
In [16]:
plt.figure(figsize=(8, 5))
sns.countplot(x='Gender', data=airline, palette='icefire')
plt.title('Count of Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Female', 'Male'])
plt.show()
Age by Class & Gender¶
In [17]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='Class', y='Age', hue='Gender', data=airline, palette = 'PuBuGn')
plt.title('Age by Class and Gender: Female:0, Male:1')
plt.xlabel('Class')
plt.ylabel('Age')
plt.show()
Count of Customer Type¶
In [18]:
plt.figure(figsize=(8, 5))
sns.countplot(x='Customer Type', data=airline, palette='cividis')
plt.title('Count of Customer Type')
plt.xlabel('Customer Type')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Loyal', 'Disloyal'])
plt.show()
In [ ]:
In [ ]:
Correlation¶
In [19]:
correlation_matrix = airline.corr()
#plot
plt.figure(figsize = (25, 25))
sns.heatmap(correlation_matrix, cmap = 'magma')
plt.title('Correlation Matrix')
plt.show()
#correlation matrix
In [ ]:
In [ ]:
Overall model comparision for the entire data¶
Test-Train Split (40:60)¶
In [20]:
X = airline.drop(['satisfaction'],axis = 1)
y = airline['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Logistic Classifier¶
In [21]:
#logistic regression
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
log_model.fit(X_train, y_train)
#calculate accuracy
y_pred_log = log_model.predict(X_test)
accuracy_log = accuracy_score(y_test, y_pred_log)
print("Accuracy of Logistic Regression:", accuracy_log)
Accuracy of Logistic Regression: 0.8049738219895288
Decision Tree Classifier¶
In [22]:
#check which depth to use
scores_list = []
depth_list = np.arange(1,20,1)
for depth in depth_list:
dt = DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0)
scores = cross_val_score(dt, X_train, y_train, cv = 10,scoring = 'accuracy')
scores_list.append(scores.mean())
#plot
plt.plot(depth_list, scores_list, color = 'purple', markerfacecolor = 'black',label = 'Score')
plt.title('Accuracy Score vs max_depth')
plt.show()
In [23]:
# for max accuracy depth
max_value = max(scores_list)
max_index = scores_list.index(max_value)
max_index
Out[23]:
13
In [24]:
dt = DecisionTreeClassifier(max_depth = 13, criterion = "gini", random_state = 0)
dt_model = dt.fit(X_train, y_train)
#calculate accuracy
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy of Decision Tree:", accuracy_dt)
Accuracy of Decision Tree: 0.9531683092085002
Bagging Classifier¶
In [25]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0)
clf_bag.fit(X_train_scaled, y_train)
y_pred_bag = clf_bag.predict(X_test_scaled)
#calculate accuracy
accuracy_bag = accuracy_score(y_test, y_pred_bag)
print("Accuracy of Bagging Classifier:", accuracy_bag)
Accuracy of Bagging Classifier: 0.9584616569140745
Random Forest Classifier¶
In [26]:
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42)
rand_forest_model.fit(X_train, y_train)
#calculate accuracy
y_pred_rand_forest = rand_forest_model.predict(X_test)
accuracy_rand_forest = accuracy_score(y_test, y_pred_rand_forest)
print("Accuracy of Random Forest:", accuracy_rand_forest)
Accuracy of Random Forest: 0.9622343701878657
GB Classifier¶
In [27]:
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42)
gbc.fit(X_train, y_train)
#calculate accuracy
y_pred_gbc = gbc.predict(X_test)
accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
print("Accuracy of GB Classifier:", accuracy_gbc)
Accuracy of GB Classifier: 0.943505543578688
KNN Classifier¶
In [28]:
#k-means
cluster_range = range(2,50)
cluster_wss = []
for num_cluster in cluster_range:
clusters = KMeans(num_cluster)
clusters.fit(X_train)
cluster_wss.append(clusters.inertia_)
plt.xlabel('# Clusters')
plt.ylabel('WSS')
plt.plot(cluster_range, cluster_wss, marker = 'o', color = 'purple')
plt.show()
In [29]:
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X, y)
#calculate accuracy
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy of KNN:", accuracy_knn)
Accuracy of KNN: 0.8247420696027102
Comparing all the models on overall data¶
In [30]:
#to keep decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state =0).fit(X_train, y_train)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[30]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 99.31 | 0.05 | 95.96 | 0.11 |
| 2 | Bagging Classifier | 98.91 | 0.10 | 95.53 | 0.12 |
| 4 | Gradient Boosting Classifier | 98.77 | 0.07 | 94.17 | 0.19 |
| 1 | Decision Tree Classifier | 96.80 | 0.07 | 94.75 | 0.17 |
| 0 | Logistic Regression | 85.97 | 1.06 | 79.69 | 1.10 |
| 5 | KNN | 78.88 | 0.13 | 73.37 | 0.13 |
Best model for overall data = Random Forest Classifier¶
In [ ]:
In [ ]:
Subsetting data by Class¶
Business Class¶
In [31]:
business = airline[airline['Class'] == 2]
business.head()
Out[31]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0 | 36 | 0 | 2 | 2863 | 1 | 1 | 3 | 1 | ... | 4 | 4 | 4 | 4 | 3 | 4 | 5 | 0 | 0.0 | 1 |
| 3 | 1 | 0 | 44 | 0 | 2 | 3377 | 0 | 0 | 0 | 2 | ... | 1 | 1 | 1 | 1 | 3 | 1 | 4 | 0 | 6.0 | 1 |
| 6 | 0 | 0 | 77 | 0 | 2 | 3987 | 5 | 5 | 5 | 5 | ... | 5 | 5 | 5 | 5 | 4 | 5 | 3 | 0 | 0.0 | 1 |
| 7 | 0 | 0 | 43 | 0 | 2 | 2556 | 2 | 2 | 2 | 2 | ... | 4 | 4 | 4 | 4 | 5 | 4 | 3 | 77 | 65.0 | 1 |
| 9 | 0 | 0 | 46 | 0 | 2 | 1744 | 2 | 2 | 2 | 2 | ... | 4 | 4 | 4 | 4 | 5 | 4 | 4 | 28 | 14.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [32]:
X = business.drop(['satisfaction'], axis = 1)
y = business['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [33]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion = 'gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[33]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 99.55 | 0.02 | 97.19 | 0.09 |
| 2 | Bagging Classifier | 99.25 | 0.07 | 96.79 | 0.15 |
| 4 | Gradient Boosting Classifier | 99.09 | 0.05 | 95.53 | 0.24 |
| 1 | Decision Tree Classifier | 96.46 | 0.63 | 95.77 | 0.18 |
| 0 | Logistic Regression | 88.19 | 0.89 | 81.67 | 0.62 |
| 5 | KNN | 73.12 | 0.20 | 72.60 | 0.29 |
Best model for Business class data = Random Forest Classifier¶
Confusion Matrix¶
In [34]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Economy Class¶
In [35]:
economy = airline[airline['Class'] == 0]
economy.head()
Out[35]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 52 | 0 | 0 | 160 | 5 | 4 | 3 | 4 | ... | 5 | 5 | 5 | 5 | 2 | 5 | 5 | 50 | 44.0 | 1 |
| 2 | 1 | 1 | 20 | 0 | 0 | 192 | 2 | 0 | 2 | 4 | ... | 2 | 4 | 1 | 3 | 2 | 2 | 2 | 0 | 0.0 | 0 |
| 4 | 0 | 0 | 49 | 0 | 0 | 1182 | 2 | 3 | 4 | 3 | ... | 2 | 2 | 2 | 2 | 4 | 2 | 4 | 0 | 20.0 | 1 |
| 5 | 1 | 0 | 16 | 0 | 0 | 311 | 3 | 3 | 3 | 3 | ... | 5 | 4 | 3 | 1 | 1 | 2 | 5 | 0 | 0.0 | 1 |
| 8 | 1 | 0 | 47 | 0 | 0 | 556 | 5 | 2 | 2 | 2 | ... | 5 | 2 | 2 | 5 | 3 | 3 | 5 | 1 | 0.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [36]:
X = economy.drop(['satisfaction'], axis = 1)
y = economy['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [37]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[37]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 98.12 | 0.11 | 94.51 | 0.17 |
| 4 | Gradient Boosting Classifier | 98.03 | 0.05 | 94.03 | 0.27 |
| 2 | Bagging Classifier | 97.35 | 0.08 | 94.56 | 0.14 |
| 1 | Decision Tree Classifier | 95.85 | 0.28 | 94.13 | 0.14 |
| 0 | Logistic Regression | 81.10 | 0.44 | 86.33 | 0.58 |
| 5 | KNN | 66.12 | 0.30 | 81.80 | 0.21 |
Best model for Economy Class data = Random Forest Classifier¶
Confusion Matrix¶
In [38]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Economy Plus Class¶
In [39]:
economy_plus = airline[airline['Class'] == 1]
economy_plus.head()
Out[39]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 17 | 1 | 0 | 52 | 1 | 1 | 1075 | 5 | 4 | 5 | 3 | ... | 4 | 3 | 5 | 5 | 4 | 5 | 4 | 0 | 0.0 | 1 |
| 44 | 0 | 0 | 36 | 0 | 1 | 267 | 2 | 3 | 3 | 3 | ... | 2 | 1 | 1 | 3 | 4 | 4 | 2 | 59 | 55.0 | 0 |
| 50 | 0 | 0 | 58 | 1 | 1 | 990 | 3 | 5 | 2 | 3 | ... | 1 | 1 | 2 | 1 | 5 | 1 | 3 | 0 | 0.0 | 0 |
| 65 | 0 | 0 | 53 | 1 | 1 | 964 | 1 | 5 | 1 | 2 | ... | 4 | 4 | 1 | 4 | 5 | 4 | 4 | 6 | 0.0 | 0 |
| 74 | 0 | 1 | 26 | 0 | 1 | 913 | 1 | 2 | 2 | 2 | ... | 4 | 3 | 2 | 1 | 1 | 4 | 4 | 0 | 0.0 | 0 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [40]:
X = economy_plus.drop(['satisfaction'], axis = 1)
y = economy_plus['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [41]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[41]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 4 | Gradient Boosting Classifier | 98.22 | 0.14 | 94.01 | 0.11 |
| 3 | Random Forest Classifier | 97.97 | 0.03 | 93.66 | 0.35 |
| 2 | Bagging Classifier | 97.01 | 0.17 | 93.85 | 0.22 |
| 1 | Decision Tree Classifier | 95.53 | 0.41 | 92.99 | 0.41 |
| 0 | Logistic Regression | 84.19 | 0.89 | 84.77 | 0.73 |
| 5 | KNN | 60.12 | 0.65 | 73.34 | 0.30 |
Best model for Economy Plus Class data = Gradient Boosting Classifier¶
Confusion Matrix¶
In [42]:
y_pred_gbc = gbc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_gbc, labels = gbc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = gbc.classes_)
disp.plot(cmap = 'flare')
plt.show()
In [ ]:
In [ ]:
Subsetting data by Gender¶
Female data¶
In [43]:
female = airline[airline['Gender'] == 0]
female.head()
Out[43]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 52 | 0 | 0 | 160 | 5 | 4 | 3 | 4 | ... | 5 | 5 | 5 | 5 | 2 | 5 | 5 | 50 | 44.0 | 1 |
| 1 | 0 | 0 | 36 | 0 | 2 | 2863 | 1 | 1 | 3 | 1 | ... | 4 | 4 | 4 | 4 | 3 | 4 | 5 | 0 | 0.0 | 1 |
| 4 | 0 | 0 | 49 | 0 | 0 | 1182 | 2 | 3 | 4 | 3 | ... | 2 | 2 | 2 | 2 | 4 | 2 | 4 | 0 | 20.0 | 1 |
| 6 | 0 | 0 | 77 | 0 | 2 | 3987 | 5 | 5 | 5 | 5 | ... | 5 | 5 | 5 | 5 | 4 | 5 | 3 | 0 | 0.0 | 1 |
| 7 | 0 | 0 | 43 | 0 | 2 | 2556 | 2 | 2 | 2 | 2 | ... | 4 | 4 | 4 | 4 | 5 | 4 | 3 | 77 | 65.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [44]:
X = female.drop(['satisfaction'], axis = 1)
y = female['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [45]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[45]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 99.23 | 0.03 | 95.85 | 0.16 |
| 2 | Bagging Classifier | 98.85 | 0.09 | 95.58 | 0.20 |
| 4 | Gradient Boosting Classifier | 98.78 | 0.05 | 94.14 | 0.05 |
| 1 | Decision Tree Classifier | 96.65 | 0.11 | 94.68 | 0.08 |
| 0 | Logistic Regression | 86.07 | 0.23 | 80.57 | 0.34 |
| 5 | KNN | 76.36 | 0.54 | 71.54 | 0.39 |
Best model for Female data = Random Forest Classifier¶
Confusion Matrix¶
In [46]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Male data¶
In [47]:
male = airline[airline['Gender'] == 1]
male.head()
Out[47]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1 | 1 | 20 | 0 | 0 | 192 | 2 | 0 | 2 | 4 | ... | 2 | 4 | 1 | 3 | 2 | 2 | 2 | 0 | 0.0 | 0 |
| 3 | 1 | 0 | 44 | 0 | 2 | 3377 | 0 | 0 | 0 | 2 | ... | 1 | 1 | 1 | 1 | 3 | 1 | 4 | 0 | 6.0 | 1 |
| 5 | 1 | 0 | 16 | 0 | 0 | 311 | 3 | 3 | 3 | 3 | ... | 5 | 4 | 3 | 1 | 1 | 2 | 5 | 0 | 0.0 | 1 |
| 8 | 1 | 0 | 47 | 0 | 0 | 556 | 5 | 2 | 2 | 2 | ... | 5 | 2 | 2 | 5 | 3 | 3 | 5 | 1 | 0.0 | 1 |
| 15 | 1 | 0 | 50 | 1 | 0 | 83 | 3 | 4 | 0 | 3 | ... | 2 | 4 | 2 | 4 | 4 | 5 | 2 | 5 | 2.0 | 0 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [48]:
X = male.drop(['satisfaction'], axis = 1)
y = male['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [49]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[49]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 99.22 | 0.03 | 95.69 | 0.10 |
| 2 | Bagging Classifier | 98.83 | 0.06 | 95.48 | 0.12 |
| 4 | Gradient Boosting Classifier | 98.80 | 0.02 | 94.22 | 0.04 |
| 1 | Decision Tree Classifier | 96.32 | 0.20 | 94.36 | 0.17 |
| 0 | Logistic Regression | 87.99 | 0.51 | 81.46 | 0.85 |
| 5 | KNN | 76.63 | 0.11 | 71.47 | 0.10 |
Best model for Male data = Random Forest Classifier¶
Confusion Matrix¶
In [50]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
In [ ]:
In [ ]:
Subsetting data by Age¶
Age group 1: 6 - 18¶
In [51]:
age_1 = airline[(airline['Age'] > 6) & (airline['Age'] <= 18)]
age_1.head()
Out[51]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 1 | 0 | 16 | 0 | 0 | 311 | 3 | 3 | 3 | 3 | ... | 5 | 4 | 3 | 1 | 1 | 2 | 5 | 0 | 0.0 | 1 |
| 35 | 1 | 0 | 12 | 1 | 0 | 674 | 3 | 4 | 3 | 1 | ... | 1 | 3 | 3 | 5 | 4 | 4 | 1 | 80 | 70.0 | 0 |
| 108 | 0 | 0 | 7 | 1 | 0 | 1120 | 3 | 5 | 3 | 3 | ... | 2 | 4 | 4 | 4 | 5 | 5 | 2 | 16 | 13.0 | 0 |
| 130 | 0 | 0 | 17 | 1 | 0 | 89 | 3 | 4 | 3 | 1 | ... | 5 | 3 | 4 | 5 | 5 | 5 | 5 | 0 | 0.0 | 0 |
| 131 | 1 | 0 | 15 | 1 | 1 | 192 | 2 | 4 | 2 | 3 | ... | 5 | 5 | 4 | 5 | 5 | 5 | 5 | 5 | 8.0 | 0 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [52]:
X = age_1.drop(['satisfaction'], axis = 1)
y = age_1['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [53]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[53]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 4 | Gradient Boosting Classifier | 98.46 | 0.21 | 95.23 | 0.29 |
| 3 | Random Forest Classifier | 98.22 | 0.19 | 95.08 | 0.32 |
| 1 | Decision Tree Classifier | 97.69 | 0.35 | 94.47 | 0.17 |
| 2 | Bagging Classifier | 97.42 | 0.33 | 94.60 | 0.37 |
| 0 | Logistic Regression | 81.14 | 0.33 | 86.28 | 1.02 |
| 5 | KNN | 65.50 | 1.14 | 81.78 | 0.85 |
Best model for Age group 1 data = Gradient Boosting Classifier¶
Confusion Matrix¶
In [54]:
y_pred_gbc = gbc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_gbc, labels = gbc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = gbc.classes_)
disp.plot(cmap = 'flare')
plt.show()
Age group 2: 19 - 24¶
In [55]:
age_2 = airline[(airline['Age'] > 18) & (airline['Age'] <= 24)]
age_2.head()
Out[55]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1 | 1 | 20 | 0 | 0 | 192 | 2 | 0 | 2 | 4 | ... | 2 | 4 | 1 | 3 | 2 | 2 | 2 | 0 | 0.0 | 0 |
| 26 | 1 | 0 | 24 | 0 | 2 | 3680 | 4 | 1 | 4 | 4 | ... | 2 | 5 | 5 | 5 | 5 | 4 | 2 | 0 | 0.0 | 1 |
| 27 | 0 | 0 | 22 | 0 | 0 | 1521 | 4 | 1 | 1 | 1 | ... | 4 | 1 | 4 | 1 | 1 | 5 | 4 | 3 | 13.0 | 1 |
| 33 | 1 | 0 | 22 | 1 | 0 | 1846 | 4 | 5 | 4 | 4 | ... | 5 | 5 | 4 | 4 | 4 | 4 | 5 | 40 | 68.0 | 0 |
| 41 | 1 | 1 | 24 | 0 | 0 | 725 | 4 | 4 | 4 | 2 | ... | 2 | 4 | 2 | 3 | 1 | 5 | 2 | 0 | 4.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [56]:
X = age_2.drop(['satisfaction'], axis = 1)
y = age_2['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [57]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[57]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 98.81 | 0.06 | 94.57 | 0.11 |
| 4 | Gradient Boosting Classifier | 98.80 | 0.08 | 94.15 | 0.17 |
| 2 | Bagging Classifier | 98.13 | 0.27 | 93.42 | 0.41 |
| 1 | Decision Tree Classifier | 96.46 | 0.48 | 93.29 | 0.53 |
| 0 | Logistic Regression | 84.45 | 0.62 | 81.30 | 0.57 |
| 5 | KNN | 68.74 | 0.74 | 68.08 | 0.55 |
Best model for Age group 2 data = Random Forest Classifier¶
Confusion Matrix¶
In [58]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Age group 3: 25 - 34¶
In [59]:
age_3 = airline[(airline['Age'] > 25) & (airline['Age'] <= 34)]
age_3.head()
Out[59]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | 0 | 0 | 33 | 0 | 2 | 325 | 2 | 5 | 5 | 5 | ... | 2 | 2 | 2 | 2 | 3 | 2 | 4 | 18 | 7.0 | 0 |
| 16 | 0 | 0 | 31 | 0 | 0 | 728 | 2 | 5 | 5 | 5 | ... | 2 | 4 | 3 | 3 | 4 | 3 | 2 | 2 | 0.0 | 0 |
| 24 | 0 | 1 | 30 | 0 | 0 | 528 | 4 | 3 | 5 | 3 | ... | 2 | 3 | 2 | 3 | 4 | 4 | 2 | 0 | 0.0 | 0 |
| 38 | 0 | 1 | 32 | 0 | 2 | 802 | 4 | 4 | 4 | 2 | ... | 2 | 4 | 2 | 4 | 3 | 5 | 2 | 0 | 10.0 | 0 |
| 43 | 0 | 0 | 28 | 0 | 2 | 1024 | 1 | 1 | 1 | 1 | ... | 4 | 5 | 3 | 5 | 5 | 4 | 4 | 0 | 3.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [60]:
X = age_3.drop(['satisfaction'], axis = 1)
y = age_3['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [61]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[61]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 99.17 | 0.04 | 95.69 | 0.07 |
| 4 | Gradient Boosting Classifier | 99.00 | 0.08 | 94.41 | 0.22 |
| 2 | Bagging Classifier | 98.76 | 0.09 | 95.26 | 0.22 |
| 1 | Decision Tree Classifier | 96.40 | 0.27 | 94.32 | 0.25 |
| 0 | Logistic Regression | 88.52 | 0.85 | 83.88 | 0.76 |
| 5 | KNN | 73.09 | 0.30 | 71.11 | 0.33 |
Best model for Age group 3 data = Random Forest Classifier¶
Confusion Matrix¶
In [62]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Age group 4: 35 - 44¶
In [63]:
age_4 = airline[(airline['Age'] > 35) & (airline['Age'] <= 44)]
age_4.head()
Out[63]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0 | 36 | 0 | 2 | 2863 | 1 | 1 | 3 | 1 | ... | 4 | 4 | 4 | 4 | 3 | 4 | 5 | 0 | 0.0 | 1 |
| 3 | 1 | 0 | 44 | 0 | 2 | 3377 | 0 | 0 | 0 | 2 | ... | 1 | 1 | 1 | 1 | 3 | 1 | 4 | 0 | 6.0 | 1 |
| 7 | 0 | 0 | 43 | 0 | 2 | 2556 | 2 | 2 | 2 | 2 | ... | 4 | 4 | 4 | 4 | 5 | 4 | 3 | 77 | 65.0 | 1 |
| 18 | 0 | 0 | 43 | 1 | 0 | 1927 | 3 | 4 | 3 | 1 | ... | 5 | 5 | 3 | 5 | 4 | 5 | 3 | 0 | 0.0 | 0 |
| 21 | 1 | 0 | 43 | 1 | 0 | 1437 | 3 | 4 | 3 | 4 | ... | 2 | 4 | 2 | 4 | 4 | 5 | 2 | 0 | 0.0 | 0 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [64]:
X = age_4.drop(['satisfaction'], axis = 1)
y = age_4['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [65]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[65]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 98.85 | 0.07 | 94.97 | 0.23 |
| 4 | Gradient Boosting Classifier | 98.56 | 0.06 | 93.79 | 0.19 |
| 2 | Bagging Classifier | 98.36 | 0.10 | 94.79 | 0.33 |
| 1 | Decision Tree Classifier | 96.65 | 0.21 | 94.15 | 0.15 |
| 0 | Logistic Regression | 90.80 | 0.34 | 82.92 | 0.41 |
| 5 | KNN | 76.41 | 0.44 | 70.89 | 0.42 |
Best model for Age group 4 data = Random Forest Classifier¶
Confusion Matrix¶
In [66]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Age group 5: 45 - 54¶
In [67]:
age_5 = airline[(airline['Age'] > 45) & (airline['Age'] <= 54)]
age_5.head()
Out[67]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 52 | 0 | 0 | 160 | 5 | 4 | 3 | 4 | ... | 5 | 5 | 5 | 5 | 2 | 5 | 5 | 50 | 44.0 | 1 |
| 4 | 0 | 0 | 49 | 0 | 0 | 1182 | 2 | 3 | 4 | 3 | ... | 2 | 2 | 2 | 2 | 4 | 2 | 4 | 0 | 20.0 | 1 |
| 8 | 1 | 0 | 47 | 0 | 0 | 556 | 5 | 2 | 2 | 2 | ... | 5 | 2 | 2 | 5 | 3 | 3 | 5 | 1 | 0.0 | 1 |
| 9 | 0 | 0 | 46 | 0 | 2 | 1744 | 2 | 2 | 2 | 2 | ... | 4 | 4 | 4 | 4 | 5 | 4 | 4 | 28 | 14.0 | 1 |
| 10 | 0 | 0 | 47 | 0 | 0 | 1235 | 4 | 1 | 1 | 1 | ... | 3 | 3 | 4 | 3 | 1 | 3 | 4 | 29 | 19.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [68]:
X = age_5.drop(['satisfaction'], axis = 1)
y = age_5['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [69]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[69]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 99.20 | 0.12 | 95.59 | 0.38 |
| 4 | Gradient Boosting Classifier | 98.98 | 0.15 | 94.84 | 0.26 |
| 2 | Bagging Classifier | 98.90 | 0.22 | 95.45 | 0.32 |
| 1 | Decision Tree Classifier | 97.21 | 0.38 | 94.77 | 0.27 |
| 0 | Logistic Regression | 93.29 | 0.58 | 86.20 | 0.68 |
| 5 | KNN | 77.63 | 0.54 | 71.59 | 0.74 |
Best model for Age group 5 data = Random Forest Classifier¶
Confusion Matrix¶
In [70]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Age group 6: 55 - 64¶
In [71]:
age_6 = airline[(airline['Age'] > 55) & (airline['Age'] <= 64)]
age_6.head()
Out[71]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13 | 0 | 0 | 60 | 0 | 2 | 451 | 1 | 1 | 4 | 1 | ... | 5 | 5 | 5 | 5 | 3 | 5 | 5 | 117 | 113.0 | 1 |
| 20 | 1 | 0 | 60 | 0 | 2 | 612 | 4 | 4 | 4 | 4 | ... | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 21 | 49.0 | 1 |
| 25 | 1 | 0 | 62 | 1 | 0 | 710 | 3 | 5 | 3 | 4 | ... | 2 | 3 | 5 | 5 | 4 | 4 | 2 | 0 | 0.0 | 0 |
| 31 | 0 | 0 | 56 | 0 | 0 | 308 | 2 | 3 | 3 | 3 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0 | 0.0 | 1 |
| 45 | 0 | 0 | 62 | 0 | 0 | 550 | 5 | 1 | 1 | 1 | ... | 5 | 5 | 5 | 5 | 5 | 5 | 2 | 92 | 90.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [72]:
X = age_6.drop(['satisfaction'], axis = 1)
y = age_6['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [73]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[73]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 4 | Gradient Boosting Classifier | 99.19 | 0.12 | 95.49 | 0.40 |
| 3 | Random Forest Classifier | 99.07 | 0.18 | 95.55 | 0.42 |
| 2 | Bagging Classifier | 98.79 | 0.08 | 95.42 | 0.21 |
| 1 | Decision Tree Classifier | 95.98 | 0.11 | 94.45 | 0.22 |
| 0 | Logistic Regression | 93.41 | 0.18 | 87.34 | 0.31 |
| 5 | KNN | 76.55 | 0.06 | 70.95 | 0.24 |
Best model for Age group 6 data = Gradient Boosting Classifier¶
Confusion Matrix¶
In [74]:
y_pred_gbc = gbc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_gbc, labels = gbc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = gbc.classes_)
disp.plot(cmap = 'flare')
plt.show()
Age group 7: above 65¶
In [75]:
age_7 = airline[(airline['Age'] >= 65)]
age_7.head()
Out[75]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 0 | 0 | 77 | 0 | 2 | 3987 | 5 | 5 | 5 | 5 | ... | 5 | 5 | 5 | 5 | 4 | 5 | 3 | 0 | 0.0 | 1 |
| 57 | 0 | 0 | 67 | 1 | 0 | 626 | 3 | 4 | 3 | 1 | ... | 3 | 3 | 3 | 3 | 3 | 3 | 5 | 0 | 0.0 | 0 |
| 61 | 0 | 0 | 70 | 1 | 0 | 1829 | 1 | 4 | 1 | 1 | ... | 5 | 5 | 1 | 5 | 4 | 5 | 3 | 0 | 0.0 | 0 |
| 63 | 1 | 0 | 66 | 0 | 0 | 404 | 4 | 1 | 1 | 1 | ... | 4 | 3 | 1 | 3 | 2 | 5 | 4 | 9 | 6.0 | 1 |
| 67 | 0 | 0 | 69 | 1 | 0 | 872 | 2 | 3 | 2 | 2 | ... | 5 | 5 | 2 | 5 | 3 | 5 | 3 | 14 | 0.0 | 0 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [76]:
X = age_7.drop(['satisfaction'], axis = 1)
y = age_7['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [77]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[77]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 4 | Gradient Boosting Classifier | 98.25 | 0.18 | 94.76 | 0.22 |
| 3 | Random Forest Classifier | 97.55 | 0.29 | 94.62 | 0.35 |
| 2 | Bagging Classifier | 96.87 | 0.59 | 94.40 | 0.36 |
| 1 | Decision Tree Classifier | 93.48 | 1.86 | 93.54 | 0.62 |
| 0 | Logistic Regression | 83.89 | 0.92 | 87.46 | 0.42 |
| 5 | KNN | 61.17 | 0.60 | 79.99 | 0.41 |
Best model for Age group 7 data = Gradient Boosting Classifier¶
Confusion Matrix¶
In [78]:
y_pred_gbc = gbc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_gbc, labels = gbc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = gbc.classes_)
disp.plot(cmap = 'flare')
plt.show()
In [ ]:
In [ ]:
Subsetting data by Type of Travel¶
Business Travel¶
In [79]:
btravel = airline[airline['Type of Travel'] == 0]
btravel.head()
Out[79]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 52 | 0 | 0 | 160 | 5 | 4 | 3 | 4 | ... | 5 | 5 | 5 | 5 | 2 | 5 | 5 | 50 | 44.0 | 1 |
| 1 | 0 | 0 | 36 | 0 | 2 | 2863 | 1 | 1 | 3 | 1 | ... | 4 | 4 | 4 | 4 | 3 | 4 | 5 | 0 | 0.0 | 1 |
| 2 | 1 | 1 | 20 | 0 | 0 | 192 | 2 | 0 | 2 | 4 | ... | 2 | 4 | 1 | 3 | 2 | 2 | 2 | 0 | 0.0 | 0 |
| 3 | 1 | 0 | 44 | 0 | 2 | 3377 | 0 | 0 | 0 | 2 | ... | 1 | 1 | 1 | 1 | 3 | 1 | 4 | 0 | 6.0 | 1 |
| 4 | 0 | 0 | 49 | 0 | 0 | 1182 | 2 | 3 | 4 | 3 | ... | 2 | 2 | 2 | 2 | 4 | 2 | 4 | 0 | 20.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [80]:
X = btravel.drop(['satisfaction'], axis = 1)
y = btravel['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [81]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[81]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 99.37 | 0.06 | 96.25 | 0.15 |
| 2 | Bagging Classifier | 98.93 | 0.15 | 95.80 | 0.20 |
| 4 | Gradient Boosting Classifier | 98.75 | 0.08 | 94.26 | 0.22 |
| 1 | Decision Tree Classifier | 95.39 | 0.12 | 94.98 | 0.20 |
| 0 | Logistic Regression | 85.83 | 1.64 | 78.15 | 1.89 |
| 5 | KNN | 77.71 | 0.37 | 71.74 | 0.08 |
Best model for Business Travel data = Random Forest Classifier¶
Confusion Matrix¶
In [82]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Personal Travel¶
In [83]:
ptravel = airline[airline['Type of Travel'] == 1]
ptravel.head()
Out[83]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15 | 1 | 0 | 50 | 1 | 0 | 83 | 3 | 4 | 0 | 3 | ... | 2 | 4 | 2 | 4 | 4 | 5 | 2 | 5 | 2.0 | 0 |
| 17 | 1 | 0 | 52 | 1 | 1 | 1075 | 5 | 4 | 5 | 3 | ... | 4 | 3 | 5 | 5 | 4 | 5 | 4 | 0 | 0.0 | 1 |
| 18 | 0 | 0 | 43 | 1 | 0 | 1927 | 3 | 4 | 3 | 1 | ... | 5 | 5 | 3 | 5 | 4 | 5 | 3 | 0 | 0.0 | 0 |
| 21 | 1 | 0 | 43 | 1 | 0 | 1437 | 3 | 4 | 3 | 4 | ... | 2 | 4 | 2 | 4 | 4 | 5 | 2 | 0 | 0.0 | 0 |
| 22 | 1 | 0 | 55 | 1 | 0 | 302 | 1 | 2 | 4 | 3 | ... | 4 | 1 | 3 | 2 | 4 | 3 | 4 | 0 | 0.0 | 0 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [84]:
X = ptravel.drop(['satisfaction'], axis = 1)
y = ptravel['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [85]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[85]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 4 | Gradient Boosting Classifier | 97.83 | 0.07 | 95.80 | 0.32 |
| 3 | Random Forest Classifier | 97.64 | 0.06 | 95.72 | 0.34 |
| 2 | Bagging Classifier | 96.55 | 0.25 | 95.32 | 0.24 |
| 1 | Decision Tree Classifier | 79.37 | 1.20 | 95.80 | 0.32 |
| 0 | Logistic Regression | 74.35 | 1.73 | 90.36 | 0.51 |
| 5 | KNN | 58.49 | 0.62 | 89.76 | 0.27 |
Best model for Personal Travel data = Gradient Boosting Classifier¶
Confusion Matrix¶
In [86]:
y_pred_gbc = gbc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_gbc, labels = gbc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = gbc.classes_)
disp.plot(cmap = 'flare')
plt.show()
In [ ]:
In [ ]:
Subsetting by Customer Type¶
Loyal Customer¶
In [87]:
loyal = airline[airline['Customer Type'] == 0]
loyal.head()
Out[87]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 52 | 0 | 0 | 160 | 5 | 4 | 3 | 4 | ... | 5 | 5 | 5 | 5 | 2 | 5 | 5 | 50 | 44.0 | 1 |
| 1 | 0 | 0 | 36 | 0 | 2 | 2863 | 1 | 1 | 3 | 1 | ... | 4 | 4 | 4 | 4 | 3 | 4 | 5 | 0 | 0.0 | 1 |
| 3 | 1 | 0 | 44 | 0 | 2 | 3377 | 0 | 0 | 0 | 2 | ... | 1 | 1 | 1 | 1 | 3 | 1 | 4 | 0 | 6.0 | 1 |
| 4 | 0 | 0 | 49 | 0 | 0 | 1182 | 2 | 3 | 4 | 3 | ... | 2 | 2 | 2 | 2 | 4 | 2 | 4 | 0 | 20.0 | 1 |
| 5 | 1 | 0 | 16 | 0 | 0 | 311 | 3 | 3 | 3 | 3 | ... | 5 | 4 | 3 | 1 | 1 | 2 | 5 | 0 | 0.0 | 1 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [88]:
X = loyal.drop(['satisfaction'], axis = 1)
y = loyal['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [89]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[89]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 3 | Random Forest Classifier | 99.46 | 0.04 | 96.67 | 0.12 |
| 2 | Bagging Classifier | 99.16 | 0.07 | 96.37 | 0.05 |
| 4 | Gradient Boosting Classifier | 99.12 | 0.06 | 95.17 | 0.16 |
| 1 | Decision Tree Classifier | 97.24 | 0.26 | 95.70 | 0.08 |
| 0 | Logistic Regression | 88.61 | 0.24 | 81.93 | 0.43 |
| 5 | KNN | 79.89 | 0.21 | 73.37 | 0.26 |
Best model for Loyal Customer data = Random Forest Classifier¶
Confusion Matrix¶
In [90]:
y_pred_rand_forest = rand_forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rand_forest, labels = rand_forest_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rand_forest_model.classes_)
disp.plot(cmap = 'flare')
plt.show()
Disloyal Customer¶
In [91]:
disloyal = airline[airline['Customer Type'] == 1]
disloyal.head()
Out[91]:
| Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Inflight wifi service | Departure/Arrival time convenient | Ease of Online booking | Gate location | ... | Inflight entertainment | On-board service | Leg room service | Baggage handling | Checkin service | Inflight service | Cleanliness | Departure Delay in Minutes | Arrival Delay in Minutes | satisfaction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1 | 1 | 20 | 0 | 0 | 192 | 2 | 0 | 2 | 4 | ... | 2 | 4 | 1 | 3 | 2 | 2 | 2 | 0 | 0.0 | 0 |
| 24 | 0 | 1 | 30 | 0 | 0 | 528 | 4 | 3 | 5 | 3 | ... | 2 | 3 | 2 | 3 | 4 | 4 | 2 | 0 | 0.0 | 0 |
| 32 | 1 | 1 | 41 | 0 | 0 | 624 | 2 | 3 | 2 | 4 | ... | 5 | 4 | 3 | 3 | 1 | 4 | 5 | 0 | 0.0 | 0 |
| 38 | 0 | 1 | 32 | 0 | 2 | 802 | 4 | 4 | 4 | 2 | ... | 2 | 4 | 2 | 4 | 3 | 5 | 2 | 0 | 10.0 | 0 |
| 40 | 0 | 1 | 42 | 0 | 2 | 373 | 3 | 3 | 3 | 5 | ... | 4 | 3 | 2 | 5 | 5 | 5 | 4 | 0 | 0.0 | 0 |
5 rows × 23 columns
Test-Train Split (40:60)¶
In [92]:
X = disloyal.drop(['satisfaction'], axis = 1)
y = disloyal['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
Comparing all the models¶
In [93]:
#log
log_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
#decision tree score
depth_list = np.arange(1, 20, 1)
scores_list = [cross_val_score(DecisionTreeClassifier(max_depth = depth, criterion='gini', random_state = 0),
X_train, y_train, cv = 10, scoring = 'accuracy').mean() for depth in depth_list]
best_depth = max(range(len(scores_list)), key = lambda x: scores_list[x]) + 1
dt_model = DecisionTreeClassifier(max_depth = best_depth, criterion = "gini", random_state = 0).fit(X_train, y_train)
#bagging
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf_bag = BaggingClassifier(n_estimators = 10, random_state = 0).fit(X_train_scaled, y_train)
#random forest
rand_forest_model = RandomForestClassifier(n_estimators = 100, random_state = 42).fit(X_train, y_train)
#gradient boosting
gbc = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#knn
knn = KNeighborsClassifier(n_neighbors = 7).fit(X, y)
#for comparing all the models
models = []
models.append(('Logistic Regression', log_model))
models.append(('Decision Tree Classifier', dt_model))
models.append(('Bagging Classifier', clf_bag))
models.append(('Random Forest Classifier', rand_forest_model))
models.append(('Gradient Boosting Classifier', gbc))
models.append(('KNN', knn))
#evaluating model results
acc_results = []
auc_results = []
names = []
#set table to table to populate with perf results
col = ['Algorithm', 'AUC Mean', 'AUC STD', 'Accuracy Mean', 'Accuracy STD']
model_results = pd.DataFrame(columns = col)
#using k-fold cross-validation:
i = 0
for name, model in models:
kfold = model_selection.KFold(n_splits = 3)
# accuracy scoring:
cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')
# roc_auc scoring:
cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'roc_auc')
#append
acc_results.append(cv_acc_results)
auc_results.append(cv_auc_results)
names.append(name)
model_results.loc[i] = [name,
round(cv_auc_results.mean()*100, 2),
round(cv_auc_results.std()*100, 2),
round(cv_acc_results.mean()*100, 2),
round(cv_acc_results.std()*100, 2)
]
i += 1
#results
model_results.sort_values(by = ['AUC Mean'], ascending = False)
Out[93]:
| Algorithm | AUC Mean | AUC STD | Accuracy Mean | Accuracy STD | |
|---|---|---|---|---|---|
| 4 | Gradient Boosting Classifier | 98.17 | 0.08 | 94.03 | 0.09 |
| 3 | Random Forest Classifier | 97.81 | 0.14 | 93.77 | 0.21 |
| 1 | Decision Tree Classifier | 97.47 | 0.15 | 93.84 | 0.16 |
| 2 | Bagging Classifier | 97.12 | 0.09 | 93.12 | 0.28 |
| 0 | Logistic Regression | 80.39 | 0.45 | 83.96 | 0.57 |
| 5 | KNN | 69.45 | 0.12 | 77.83 | 0.17 |
Best model for Disloyal Customer data = Gradient Boosting Classifier¶
Confusion Matrix¶
In [94]:
y_pred_gbc = gbc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_gbc, labels = gbc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = gbc.classes_)
disp.plot(cmap = 'flare')
plt.show()
In [ ]:
In [ ]:
Business Decision Performance¶
- We are choosing the model that performs best for each subset and choosing the top 3 most important features (based on the model)
- Increasing the satisfaction by 1 for each of those features and predicting (based on test data) how many passengers that were initially dissatified, [since most of the model accuracies are above 97%, assuming the predictions are true] will be predicted as satisfied
- Then calculating the cost, profit and the cost profit ratio for each
Assumed costs for increasing a variable by 1 unit¶
In [95]:
costs = {
'Flight Distance': 400, 'Inflight wifi service': 80, 'Ease of Online booking': 60,
'Gate location': 140, 'Food and drink': 100, 'Online boarding': 70, 'Seat comfort': 110,
'Inflight entertainment': 50, 'On-board service': 30, 'Leg room service': 500,
'Baggage handling': 20, 'Checkin service': 20, 'Inflight service': 40, 'Cleanliness': 20
}
Assumed profit per changed customer¶
In [96]:
profit_per_change = 500
For Class¶
For Business Class: Random Forest Classifier¶
In [97]:
#business.head()
X = business.drop(['satisfaction'], axis = 1)
y = business['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [98]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [99]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[99]:
['Gender', 'Class', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
In [100]:
#training data again with the removed features
X = business.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = business['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [101]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [102]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [103]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_business_class = pd.DataFrame(table_data, columns=table_head)
results_business_class["Cost Benefit ($)"] = results_business_class["Total Profit ($)"]/results_business_class["Total Cost ($)"]
results_business_class
Out[103]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Online boarding | 1 | 70 | 150000 | 300 | 2142.857143 |
| 1 | Inflight wifi service | 1 | 80 | 320500 | 641 | 4006.250000 |
| 2 | Inflight entertainment | 1 | 50 | 162000 | 324 | 3240.000000 |
In [104]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
In [105]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_business_class, color='purple')
Out[105]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Economy Class: Random Forest Classifier¶
In [106]:
X = economy.drop(['satisfaction'], axis = 1)
y = economy['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [107]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [108]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[108]:
['Gender', 'Class', 'Gate location', 'Food and drink', 'Cleanliness']
In [109]:
#training data again with the removed features
X = economy.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = economy['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [110]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [111]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [112]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_economy_class = pd.DataFrame(table_data, columns=table_head)
results_economy_class["Cost Benefit ($)"] = results_economy_class["Total Profit ($)"]/results_economy_class["Total Cost ($)"]
results_economy_class
Out[112]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Inflight wifi service | 1 | 80 | 500 | 1 | 6.250000 |
| 1 | Online boarding | 1 | 70 | 212500 | 425 | 3035.714286 |
| 2 | Ease of Online booking | 1 | 60 | 154000 | 308 | 2566.666667 |
In [113]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_economy_class, color='purple')
Out[113]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Economy Plus Class: Gradient Boosting Classifier¶
In [114]:
X = economy_plus.drop(['satisfaction'], axis = 1) #dropping no common class for non important stuff
y = economy_plus['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [115]:
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42)
gb.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [116]:
#recursive feature elimination
rfe = RFECV(gb,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[116]:
['Gender', 'Class', 'Departure/Arrival time convenient', 'Gate location', 'Food and drink', 'Seat comfort', 'On-board service', 'Leg room service', 'Departure Delay in Minutes']
In [117]:
#training data again with the removed features
X = economy_plus.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = economy_plus['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [118]:
#training model using new infor
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
features = X.columns
#plot
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [119]:
#intital predictions
initial_predictions = gb.predict(X_test)
In [120]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = gb.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_economy_plus_class = pd.DataFrame(table_data, columns=table_head)
results_economy_plus_class["Cost Benefit ($)"] = results_economy_plus_class["Total Profit ($)"]/results_economy_plus_class["Total Cost ($)"]
results_economy_plus_class
Out[120]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Inflight wifi service | 1 | 80 | 0 | 0 | 0.0 |
| 1 | Type of Travel | 1 | 0 | 1000 | 2 | inf |
| 2 | Customer Type | 1 | 0 | 2500 | 5 | inf |
In [121]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_economy_plus_class, color='purple')
Out[121]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
In [ ]:
In [ ]:
For Gender¶
For Female data: Random Forest Classifier¶
In [122]:
X = female.drop(['satisfaction'], axis = 1)
y = female['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [123]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [124]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[124]:
['Gender', 'Food and drink', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
In [125]:
#training data again with the removed features
X = female.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = female['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [126]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [127]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [128]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_female_class = pd.DataFrame(table_data, columns=table_head)
results_female_class["Cost Benefit ($)"] = results_female_class["Total Profit ($)"]/results_female_class["Total Cost ($)"]
results_female_class
Out[128]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Inflight wifi service | 1 | 80 | 150000 | 300 | 1875.000000 |
| 1 | Online boarding | 1 | 70 | 229000 | 458 | 3271.428571 |
| 2 | Type of Travel | 1 | 0 | 39500 | 79 | inf |
In [129]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_female_class, color='purple')
Out[129]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Male data: Random Forest Classifier¶
In [130]:
X = male.drop(['satisfaction'], axis = 1)
y = male['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [131]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [132]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[132]:
['Gender', 'Gate location', 'Food and drink', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
In [133]:
#training data again with the removed features
X = male.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = male['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [134]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [135]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [136]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_male_class = pd.DataFrame(table_data, columns=table_head)
results_male_class["Cost Benefit ($)"] = results_male_class["Total Profit ($)"]/results_male_class["Total Cost ($)"]
results_male_class
Out[136]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Online boarding | 1 | 70 | 104500 | 209 | 1492.857143 |
| 1 | Inflight wifi service | 1 | 80 | 207500 | 415 | 2593.750000 |
| 2 | Type of Travel | 1 | 0 | 50000 | 100 | inf |
In [137]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_male_class, color='purple')
Out[137]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
In [ ]:
In [ ]:
For Age¶
For Age group 1(6-18): Gradient Boosting Classifier¶
In [138]:
X = age_1.drop(['satisfaction'], axis = 1)
y = age_1['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [139]:
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42)
gb.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [140]:
#recursive feature elimination
rfe = RFECV(gb,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[140]:
['Gender', 'Inflight entertainment', 'Leg room service']
In [141]:
#training data again with the removed features
X = age_1.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = age_1['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [142]:
#training model using new infor
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
features = X.columns
#plot
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [143]:
#intital predictions
initial_predictions = gb.predict(X_test)
In [144]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = gb.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_1_class = pd.DataFrame(table_data, columns=table_head)
results_age_1_class["Cost Benefit ($)"] = results_age_1_class["Total Profit ($)"]/results_age_1_class["Total Cost ($)"]
results_age_1_class
Out[144]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Inflight wifi service | 1 | 80 | 7000 | 14 | 87.5 |
| 1 | Online boarding | 1 | 70 | 3500 | 7 | 50.0 |
| 2 | Class | 1 | 0 | 8500 | 17 | inf |
In [145]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_age_1_class, color='purple')
Out[145]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Age group 2(19-24): Random Forest Classifier¶
In [146]:
X = age_2.drop(['satisfaction'], axis = 1)
y = age_2['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [147]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [148]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[148]:
['Gender', 'Customer Type', 'Age', 'Inflight entertainment', 'Leg room service']
In [149]:
#training data again with the removed features
X = age_2.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = age_2['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [150]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [151]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [152]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_2_class = pd.DataFrame(table_data, columns=table_head)
results_age_2_class["Cost Benefit ($)"] = results_age_2_class["Total Profit ($)"]/results_age_2_class["Total Cost ($)"]
results_age_2_class
Out[152]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Inflight wifi service | 1 | 80 | 30000 | 60 | 375.000000 |
| 1 | Online boarding | 1 | 70 | 60500 | 121 | 864.285714 |
| 2 | Ease of Online booking | 1 | 60 | 70000 | 140 | 1166.666667 |
In [153]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_age_2_class, color='purple')
Out[153]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
In [ ]:
For Age group 3(25-34): Random Forest Classifier¶
In [154]:
X = age_3.drop(['satisfaction'], axis = 1)
y = age_3['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [155]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [156]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[156]:
['Gender', 'Leg room service', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
In [157]:
#training data again with the removed features
X = age_3.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = age_3['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [158]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [159]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [160]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_3_class = pd.DataFrame(table_data, columns=table_head)
results_age_3_class["Cost Benefit ($)"] = results_age_3_class["Total Profit ($)"]/results_age_3_class["Total Cost ($)"]
results_age_3_class
Out[160]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Online boarding | 1 | 70 | 49000 | 98 | 700.0 |
| 1 | Inflight wifi service | 1 | 80 | 82000 | 164 | 1025.0 |
| 2 | Class | 1 | 0 | 47500 | 95 | inf |
In [161]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_age_3_class, color='purple')
Out[161]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Age group 4(35-44): Random Forest Classifier¶
In [162]:
X = age_4.drop(['satisfaction'], axis = 1)
y = age_4['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [163]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [164]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[164]:
['Gender', 'Food and drink', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
In [165]:
#training data again with the removed features
X = age_4.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = age_4['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [166]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [167]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [168]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_4_class = pd.DataFrame(table_data, columns=table_head)
results_age_4_class["Cost Benefit ($)"] = results_age_4_class["Total Profit ($)"]/results_age_4_class["Total Cost ($)"]
results_age_4_class
Out[168]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Online boarding | 1 | 70 | 75000 | 150 | 1071.428571 |
| 1 | Inflight wifi service | 1 | 80 | 71500 | 143 | 893.750000 |
| 2 | Inflight entertainment | 1 | 50 | 58000 | 116 | 1160.000000 |
In [169]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_age_4_class, color='purple')
Out[169]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Age group 5(45-54): Random Forest Classifier¶
In [170]:
X = age_5.drop(['satisfaction'], axis = 1)
y = age_5['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [171]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [172]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[172]:
['Gender', 'Age', 'Gate location', 'Food and drink', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
In [173]:
#training data again with the removed features
X = age_5.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = age_5['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [174]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [175]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [176]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_5_class = pd.DataFrame(table_data, columns=table_head)
results_age_5_class["Cost Benefit ($)"] = results_age_5_class["Total Profit ($)"]/results_age_5_class["Total Cost ($)"]
results_age_5_class
Out[176]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Type of Travel | 1 | 0 | 11000 | 22 | inf |
| 1 | Class | 1 | 0 | 52000 | 104 | inf |
| 2 | Leg room service | 1 | 500 | 32500 | 65 | 65.0 |
In [177]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_age_5_class, color='purple')
Out[177]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Age group 6(55-64): Gradient Boosting Classifier¶
In [178]:
X = age_6.drop(['satisfaction'], axis = 1)
y = age_6['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [179]:
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [180]:
#recursive feature elimination
rfe = RFECV(gb,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[180]:
['Departure Delay in Minutes']
In [181]:
#training data again with the removed features
X = age_6.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = age_6['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [182]:
#training model using new infor
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
features = X.columns
#plot
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [183]:
#intital predictions
initial_predictions = gb.predict(X_test)
In [184]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = gb.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_6_class = pd.DataFrame(table_data, columns=table_head)
results_age_6_class["Cost Benefit ($)"] = results_age_6_class["Total Profit ($)"]/results_age_6_class["Total Cost ($)"]
results_age_6_class
Out[184]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Type of Travel | 1 | 0 | 3000 | 6 | inf |
| 1 | Class | 1 | 0 | 38500 | 77 | inf |
| 2 | Inflight wifi service | 1 | 80 | 21500 | 43 | 268.75 |
In [185]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_age_6_class, color='purple')
Out[185]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Age group 7(above 65): Gradient Boosting Classifier¶
In [186]:
X = age_7.drop(['satisfaction'], axis = 1)
y = age_7['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [187]:
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [188]:
#recursive feature elimination
rfe = RFECV(gb,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[188]:
['Gender', 'Departure Delay in Minutes']
In [189]:
#training data again with the removed features
X = age_7.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = age_7['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [190]:
#training model using new infor
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
features = X.columns
#plot
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [191]:
#intital predictions
initial_predictions = gb.predict(X_test)
In [192]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = gb.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_age_7_class = pd.DataFrame(table_data, columns=table_head)
results_age_7_class["Cost Benefit ($)"] = results_age_7_class["Total Profit ($)"]/results_age_7_class["Total Cost ($)"]
results_age_7_class
Out[192]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Inflight wifi service | 1 | 80 | 5000 | 10 | 62.5 |
| 1 | Leg room service | 1 | 500 | 5000 | 10 | 10.0 |
| 2 | Type of Travel | 1 | 0 | 4000 | 8 | inf |
In [193]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_age_7_class, color='purple')
Out[193]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
In [ ]:
In [ ]:
For Type of Travel¶
For Business Travel: Random Forest Classifier¶
In [194]:
X = btravel.drop(['satisfaction'], axis = 1)
y = btravel['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [195]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [196]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[196]:
['Gender', 'Type of Travel', 'Food and drink', 'Departure Delay in Minutes']
In [197]:
#training data again with the removed features
X = btravel.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = btravel['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [198]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [199]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [200]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_btravel = pd.DataFrame(table_data, columns=table_head)
results_btravel["Cost Benefit ($)"] = results_btravel["Total Profit ($)"]/results_btravel["Total Cost ($)"]
results_btravel
Out[200]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Online boarding | 1 | 70 | 347000 | 694 | 4957.142857 |
| 1 | Inflight wifi service | 1 | 80 | 403000 | 806 | 5037.500000 |
| 2 | Inflight entertainment | 1 | 50 | 254000 | 508 | 5080.000000 |
In [201]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_btravel, color='purple')
Out[201]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Personal Travel: Gradient Boosting Classifier¶
In [202]:
X = ptravel.drop(['satisfaction'], axis = 1) #dropping no common class for non important stuff
y = ptravel['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [203]:
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42)
gb.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [204]:
#recursive feature elimination
rfe = RFECV(gb,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[204]:
['Gender']
In [205]:
#training data again with the removed features
X = ptravel.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = ptravel['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [206]:
#training model using new infor
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
features = X.columns
#plot
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [207]:
#intital predictions
initial_predictions = gb.predict(X_test)
In [208]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = gb.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_ptravel = pd.DataFrame(table_data, columns=table_head)
results_ptravel["Cost Benefit ($)"] = results_ptravel["Total Profit ($)"]/results_ptravel["Total Cost ($)"]
results_ptravel
Out[208]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Inflight wifi service | 1 | 80 | 0 | 0 | 0.0 |
| 1 | Arrival Delay in Minutes | 1 | 0 | 0 | 0 | NaN |
| 2 | Flight Distance | 1 | 400 | 21000 | 42 | 52.5 |
In [209]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_ptravel, color='purple')
Out[209]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
In [ ]:
In [ ]:
For Type of Customer¶
For Loyal Customer: Random Forest Classifier¶
In [210]:
X = loyal.drop(['satisfaction'], axis = 1)
y = loyal['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [211]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [212]:
#recursive feature elimination
rfe = RFECV(rf,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[212]:
['Gender', 'Customer Type', 'Food and drink', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
In [213]:
#training data again with the removed features
X = loyal.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = loyal['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [214]:
#training model using new infor
rf = RandomForestClassifier(random_state = 0).fit(X_train,y_train)
features = X.columns
#plot
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [215]:
#intital predictions
initial_predictions = rf.predict(X_test)
In [216]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = rf.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_loyal = pd.DataFrame(table_data, columns=table_head)
results_loyal["Cost Benefit ($)"] = results_loyal["Total Profit ($)"]/results_loyal["Total Cost ($)"]
results_loyal
Out[216]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Type of Travel | 1 | 0 | 32000 | 64 | inf |
| 1 | Online boarding | 1 | 70 | 210500 | 421 | 3007.142857 |
| 2 | Inflight wifi service | 1 | 80 | 374500 | 749 | 4681.250000 |
In [217]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_loyal, color='purple')
Out[217]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
For Disloyal Customer: Gradient Boosting Classifier¶
In [218]:
X = disloyal.drop(['satisfaction'], axis = 1) #dropping no common class for non important stuff
y = disloyal['satisfaction']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [219]:
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42)
gb.fit(X_train,y_train)
#checking current feature importances
features = X.columns
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'pink')
plt.show()
In [220]:
#recursive feature elimination
rfe = RFECV(gb,cv = 2,scoring = "neg_mean_squared_error")
rfe.fit(X_train,y_train)
#to check for columns that have been removed
selected_features = np.array(features)[rfe.get_support()]
features = X_test.columns
no_common = [value for value in features if value not in selected_features]
no_common
Out[220]:
['Gender', 'Customer Type', 'Type of Travel', 'Seat comfort', 'Cleanliness']
In [221]:
#training data again with the removed features
X = disloyal.drop(['satisfaction'] + no_common, axis = 1) #dropping no common class for non important stuff
y = disloyal['satisfaction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42) #40% test
In [222]:
#training model using new infor
gb = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, random_state = 42).fit(X_train, y_train)
features = X.columns
#plot
f_i = list(zip(features,gb.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i], color = 'k')
plt.show()
In [223]:
#intital predictions
initial_predictions = gb.predict(X_test)
In [224]:
#selecting top 3 features that have higher importance in model
top3 = [f[0] for f in sorted(f_i, key=lambda x: x[1], reverse=True)[:3]]
excluded = no_common #exluding columns that are removed with recursive feature elimination
increased_values = 1 #increasing satisfaction by 1
profits = [] #to store profits
table_data = []
for feature in top3:
X_test_increase = X_test.copy()
total_cost = 0
if feature not in excluded:
X_test_increase[feature] = X_test_increase[feature].apply(lambda x: min(x + increased_values, max_value))
cost = costs.get(feature, 0) * increased_values
total_cost += cost
y_pred_increase_new = gb.predict(X_test_increase) #new predictions
changed = sum((y_test == 0) & (y_pred_increase_new == 1)) #to see how many changed from dissatisfied to satisfied
profit = profit_per_change * changed #profit per changed customer
profits.append(profit)
table_data.append([feature, increased_values, total_cost, profit, changed])
table_head = ["Feature", "Increased Value", "Total Cost ($)", "Total Profit ($)", "Number of Changes"]
results_disloyal = pd.DataFrame(table_data, columns=table_head)
results_disloyal["Cost Benefit ($)"] = results_disloyal["Total Profit ($)"]/results_disloyal["Total Cost ($)"]
results_disloyal
Out[224]:
| Feature | Increased Value | Total Cost ($) | Total Profit ($) | Number of Changes | Cost Benefit ($) | |
|---|---|---|---|---|---|---|
| 0 | Inflight wifi service | 1 | 80 | 0 | 0 | 0.0 |
| 1 | Age | 1 | 0 | 185000 | 370 | inf |
| 2 | Class | 1 | 0 | 60000 | 120 | inf |
In [225]:
plt.figure(figsize=(7, 7))
sns.barplot(x='Feature', y='Cost Benefit ($)', data = results_disloyal, color='purple')
Out[225]:
<Axes: xlabel='Feature', ylabel='Cost Benefit ($)'>
In [ ]:
In [ ]:
Other visualizations¶
Total profits over different Classes¶
In [226]:
total_profit_business_class = results_business_class["Total Profit ($)"].sum()
total_profit_economy_class = results_economy_class["Total Profit ($)"].sum()
total__economy_plus_class = results_economy_plus_class["Total Profit ($)"].sum()
profits = [total_profit_business_class, total_profit_economy_class, total__economy_plus_class]
labels = ["Business Class", "Economy Class", "Economy Plus Class"]
plt.figure(figsize=(10, 6))
sns.barplot(x = labels, y = profits, palette = "pink")
plt.ylabel("Total Profit ($)")
plt.show()
Total profits over different Genders¶
In [227]:
total_profit_female = results_female_class["Total Profit ($)"].sum()
total_profit_male = results_male_class["Total Profit ($)"].sum()
profits = [total_profit_female, total_profit_male]
labels = ["Female", "Male"]
plt.figure(figsize=(10, 6))
sns.barplot(x = labels, y = profits, palette = "Reds")
plt.show()
Total profits over different Age groups¶
In [228]:
total_profit_age1 = results_age_1_class["Total Profit ($)"].sum()
total_profit_age2 = results_age_2_class["Total Profit ($)"].sum()
total_profit_age3 = results_age_3_class["Total Profit ($)"].sum()
total_profit_age4 = results_age_4_class["Total Profit ($)"].sum()
total_profit_age5 = results_age_5_class["Total Profit ($)"].sum()
total_profit_age6 = results_age_6_class["Total Profit ($)"].sum()
total_profit_age7 = results_age_7_class["Total Profit ($)"].sum()
profits = [total_profit_age1, total_profit_age2, total_profit_age3, total_profit_age4, total_profit_age5, total_profit_age6,
total_profit_age7]
labels = ["Age group 1", "Age group 2", "Age group 3", "Age group 4", "Age group 5", "Age group 6", "Age group 7"]
plt.figure(figsize=(10, 6))
sns.barplot(x = labels, y = profits, palette = "Purples")
plt.ylabel("Total Profit ($)")
plt.show()
Total profits over different Type of Travel¶
In [230]:
total_profit_btravel = results_btravel["Total Profit ($)"].sum()
total_profit_ptravel = results_ptravel["Total Profit ($)"].sum()
profits = [total_profit_btravel, total_profit_ptravel]
labels = ["Business Travel", "Personal Travel"]
plt.figure(figsize=(10, 6))
sns.barplot(x = labels, y = profits, palette = "Greens")
plt.ylabel("Total Profit ($)")
plt.show()
Total profits over different Type of Travel¶
In [231]:
total_profit_loyal = results_loyal["Total Profit ($)"].sum()
total_profit_disloyal = results_disloyal["Total Profit ($)"].sum()
profits = [total_profit_loyal, total_profit_disloyal]
labels = ["Loyal Customer", "Disoyal Customer"]
plt.figure(figsize=(10, 6))
sns.barplot(x = labels, y = profits, palette = "Blues")
plt.ylabel("Total Profit ($)")
plt.show()
Overall Profits for different Features¶
In [232]:
#combining results
all_results = pd.concat([results_business_class, results_economy_class, results_economy_plus_class,
results_female_class, results_male_class,
results_age_1_class, results_age_2_class, results_age_3_class, results_age_4_class,
results_age_5_class, results_age_6_class, results_age_7_class,
results_btravel, results_ptravel,
results_loyal, results_disloyal])
all_results = all_results.groupby('Feature').sum().reset_index()
In [233]:
plt.figure(figsize=(12, 6))
sns.barplot(x ='Feature', y ='Total Profit ($)', data = all_results, palette = 'crest')
plt.xticks(rotation=45, ha ='right')
plt.legend()
plt.tight_layout()
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
Overall Costs Benefit for different Features¶
In [234]:
plt.figure(figsize=(12, 6))
sns.barplot(x ='Feature', y ='Total Cost ($)', data = all_results, palette = 'coolwarm')
plt.xticks(rotation=45, ha ='right')
plt.legend()
plt.tight_layout()
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
In [235]:
#combining results
all_results = pd.concat([results_business_class, results_economy_class, results_economy_plus_class,
results_female_class, results_male_class,
results_age_1_class, results_age_2_class, results_age_3_class, results_age_4_class,
results_age_5_class, results_age_6_class, results_age_7_class,
results_btravel, results_ptravel,
results_loyal, results_disloyal])
all_results = all_results.groupby('Feature').sum().reset_index()
plt.figure(figsize=(12, 6))
sns.barplot(x ='Feature', y ='Cost Benefit ($)', data = all_results, palette = 'rocket')
plt.xticks(rotation=45, ha ='right')
plt.legend()
plt.tight_layout()
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
In [ ]:
In [ ]:
In [ ]:
In [ ]: